# 提取祥子相关对话（需人工校验）
import re
from ltp import LTP

# Instantiate the LTP analyzer with its default constructor arguments.
ltp = LTP()

# Read the whole novel into memory. The encoding is pinned explicitly so the
# Chinese text decodes the same way on every platform instead of depending on
# the locale's default codec.
# NOTE(review): assumes the file is UTF-8 — use "gb18030" instead if the
# source text turns out to be GBK/GB2312-encoded.
with open("luotuoxiangzi.txt", encoding="utf-8") as f:
    text = f.read()

# Split into paragraphs (one per line) and keep only lines longer than
# 10 characters, dropping blank lines and very short lines.
paragraphs = [p for p in text.split("\n") if len(p) > 10]

# Extract dialogue attributed to the core character (祥子).
#
# A paragraph is selected when semantic role labelling (SRL) reports at least
# one predicate containing "说" whose A0 argument contains "祥子". Every span
# enclosed in Chinese curly quotes in that paragraph is then collected.
#
# NOTE(review): this assumes ltp.srl(para) yields dict-like items with a
# 'predicate' string and an 'arguments' mapping keyed by role label — confirm
# against the installed LTP version's API.
# NOTE(review): all quoted spans in a matching paragraph are collected, even
# those that may belong to another speaker, so the result needs manual review.

# Compiled once, outside the loop; non-greedy so each quote pair is one match.
_QUOTED_SPAN = re.compile(r"“(.*?)”")

xiangzi_dialogues = []
for para in paragraphs:
    srl_result = ltp.srl(para)
    # Decide once per paragraph. Extending inside the per-predicate loop would
    # append the same quotes again for every additional matching predicate.
    spoken_by_xiangzi = any(
        "说" in verb['predicate']
        and "A0" in verb['arguments']
        and "祥子" in verb['arguments']['A0']
        for verb in srl_result
    )
    if spoken_by_xiangzi:
        xiangzi_dialogues.extend(_QUOTED_SPAN.findall(para))

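# Expected yield (unverified estimate): roughly 850-1200 core dialogue lines
# for Xiangzi; the original novel is estimated to contain about 900 lines
# spoken by him. Treat the actual count as approximate until manually checked.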